{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# S3/R2 Storage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:numexpr.utils:Note: NumExpr detected 32 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n",
      "Note: NumExpr detected 32 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n",
      "INFO:numexpr.utils:NumExpr defaulting to 8 threads.\n",
      "NumExpr defaulting to 8 threads.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/hua/code/llama_index/.hermit/python/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import logging\n",
    "import sys\n",
    "\n",
    "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n",
    "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n",
    "\n",
    "from llama_index import (\n",
    "    VectorStoreIndex,\n",
    "    SimpleDirectoryReader,\n",
    "    load_index_from_storage,\n",
    "    StorageContext,\n",
    ")\n",
    "from IPython.display import Markdown, display"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import dotenv\n",
    "import s3fs\n",
    "import os\n",
    "\n",
    "dotenv.load_dotenv(\"../../../.env\")\n",
    "\n",
    "AWS_KEY = os.environ[\"AWS_ACCESS_KEY_ID\"]\n",
    "AWS_SECRET = os.environ[\"AWS_SECRET_ACCESS_KEY\"]\n",
    "R2_ACCOUNT_ID = os.environ[\"R2_ACCOUNT_ID\"]\n",
    "\n",
    "assert AWS_KEY is not None and AWS_KEY != \"\"\n",
    "\n",
    "s3 = s3fs.S3FileSystem(\n",
    "    key=AWS_KEY,\n",
    "    secret=AWS_SECRET,\n",
    "    endpoint_url=f\"https://{R2_ACCOUNT_ID}.r2.cloudflarestorage.com\",\n",
    "    s3_additional_kwargs={\"ACL\": \"public-read\"},\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\n"
     ]
    }
   ],
   "source": [
    "# load documents\n",
    "documents = SimpleDirectoryReader(\n",
    "    \"../../../examples/paul_graham_essay/data/\"\n",
    ").load_data()\n",
    "print(len(documents))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 20729 tokens\n",
      "> [build_index_from_nodes] Total embedding token usage: 20729 tokens\n"
     ]
    }
   ],
   "source": [
    "index = VectorStoreIndex.from_documents(documents, fs=s3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save index to disk\n",
    "index.set_index_id(\"vector_index\")\n",
    "index.storage_context.persist(\"llama-index/storage_demo\", fs=s3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'Key': 'llama-index/storage_demo/docstore.json',\n",
       "  'LastModified': datetime.datetime(2023, 5, 14, 20, 23, 53, 213000, tzinfo=tzutc()),\n",
       "  'ETag': '\"3993f79a6f7cf908a8e53450a2876cf0\"',\n",
       "  'Size': 107529,\n",
       "  'StorageClass': 'STANDARD',\n",
       "  'type': 'file',\n",
       "  'size': 107529,\n",
       "  'name': 'llama-index/storage_demo/docstore.json'},\n",
       " {'Key': 'llama-index/storage_demo/index_store.json',\n",
       "  'LastModified': datetime.datetime(2023, 5, 14, 20, 23, 53, 783000, tzinfo=tzutc()),\n",
       "  'ETag': '\"5b084883bf0b08e3c2b979af7c16be43\"',\n",
       "  'Size': 3105,\n",
       "  'StorageClass': 'STANDARD',\n",
       "  'type': 'file',\n",
       "  'size': 3105,\n",
       "  'name': 'llama-index/storage_demo/index_store.json'},\n",
       " {'Key': 'llama-index/storage_demo/vector_store.json',\n",
       "  'LastModified': datetime.datetime(2023, 5, 14, 20, 23, 54, 232000, tzinfo=tzutc()),\n",
       "  'ETag': '\"75535cf22c23bcd8ead21b8a52e9517a\"',\n",
       "  'Size': 829290,\n",
       "  'StorageClass': 'STANDARD',\n",
       "  'type': 'file',\n",
       "  'size': 829290,\n",
       "  'name': 'llama-index/storage_demo/vector_store.json'}]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s3.listdir(\"llama-index/storage_demo\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load index from s3\n",
    "sc = StorageContext.from_defaults(persist_dir=\"llama-index/storage_demo\", fs=s3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.indices.loading:Loading indices with ids: ['vector_index']\n",
      "Loading indices with ids: ['vector_index']\n"
     ]
    }
   ],
   "source": [
    "index2 = load_index_from_storage(sc, \"vector_index\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['f8891670-813b-4cfa-9025-fcdc8ba73449', '985a2c69-9da5-40cf-ba30-f984921187c1', 'c55f077c-0bfb-4036-910c-6fd5f26f7372', 'b47face6-f25b-4381-bb8d-164f179d6888', '16304ef7-2378-4776-b86d-e8ed64c8fb58', '62dfdc7a-6a2f-4d5f-9033-851fbc56c14a', 'a51ef189-3924-494b-84cf-e23df673e29c', 'f94aca2b-34ac-4ec4-ac41-d31cd3b7646f', 'ad89e2fb-e0fc-4615-a380-8245bd6546af', '3dbba979-ca08-4321-b4de-be5236ac2e11', '634b2d6d-0bff-4384-898f-b521470db8ac', 'ee9551ba-7a44-493d-997b-8eeab9c04e25', 'b21fe2b5-d8e3-4895-8424-fa9e3da76711', 'bd2609e8-8b52-49e8-8ee7-41b64b3ce9e1', 'a08b739e-efd9-4a61-8517-c4f9cea8cf7d', '8d4babaf-37f1-454a-8be4-b67e1b8e428f', '05389153-4567-4e53-a2ea-bc3e020ee1b2', 'd29531a5-c5d2-4e1d-ab99-56f2b4bb7f37', '2ccb3c63-3407-4acf-b5bb-045caa588bbc', 'a0b1bebb-3dcd-4bf8-9ebb-a4cd2cb82d53', '21517b34-6c1b-4607-bf89-7ab59b85fba6', 'f2487d52-1e5e-4482-a182-218680ef306e', '979998ce-39ee-41bc-a9be-b3ed68d7c304', '3e658f36-a13e-407a-8624-0adf9e842676'])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "index2.docstore.docs.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
